Import COVID Data
url_in <- "https://github.com/CSSEGISandData/COVID-19/raw/refs/heads/master/csse_covid_19_data/csse_covid_19_time_series/"
filenames <-
c("time_series_19-covid-Confirmed.csv",
"time_series_covid19_confirmed_US.csv",
"time_series_covid19_confirmed_global.csv",
"time_series_covid19_deaths_US.csv",
"time_series_covid19_deaths_global.csv")
urls <- str_c(url_in, filenames)
global_cases <- read_csv(urls[3])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global_deaths <- read_csv(urls[5])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
us_cases <- read_csv(urls[2])
## Rows: 3342 Columns: 1154
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1148): UID, code3, FIPS, Lat, Long_, 1/22/20, 1/23/20, 1/24/20, 1/25/20...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
us_deaths <- read_csv(urls[4])
## Rows: 3342 Columns: 1155
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1149): UID, code3, FIPS, Lat, Long_, Population, 1/22/20, 1/23/20, 1/24...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Create tidy data sets
global_cases_wrangle <- global_cases %>%
pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date",
values_to = "cases") %>%
mutate(date = mdy(date)) %>%
select(-c(Lat, Long))
global_deaths_wrangle <- global_deaths %>%
pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date",
values_to = "deaths") %>%
mutate(date = mdy(date)) %>%
select(-c(Lat, Long))
# filter(deaths != 0)
us_cases_wrangle <- us_cases %>%
select(-c(UID, iso2, iso3,code3, FIPS, Lat, Long_)) %>%
rename(City = Admin2, State = Province_State, Country = Country_Region) %>%
pivot_longer(cols = -c(City, State, Country, Combined_Key),
names_to = "date",
values_to = "cases") %>%
mutate(date = mdy(date))
us_deaths_wrangle <- us_deaths %>%
select(-c(UID, iso2, iso3,code3, FIPS, Lat, Long_)) %>%
rename(City = Admin2, State = Province_State, Country = Country_Region) %>%
pivot_longer(cols = -c(City, State, Country, Combined_Key, Population),
names_to = "date",
values_to = "deaths") %>%
mutate(date = mdy(date))
# filter(deaths != 0)
global <- global_cases_wrangle %>%
full_join(global_deaths_wrangle)
## Joining with `by = join_by(`Province/State`, `Country/Region`, date)`
us <- us_cases_wrangle %>%
full_join(us_deaths_wrangle, by = c("City", "State", "Country", "Combined_Key", "date"))
The visualization of observed daily cases and deaths in the United States reveals critical trends in the progression of the COVID-19 pandemic. Peaks correspond to significant waves of infections, driven by new variants, holiday gatherings, or waning immunity. The lag between case surges and death spikes highlights the progression of the disease and its impact on healthcare resources.
The ARIMA model provided a reliable forecast of daily new deaths in the U.S., capturing seasonality and trend changes. Such forecasts can assist first responders by:
# Load the dataset
data <- us
# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")
# Ensure daily new cases and new deaths calculation if they are cumulative
data <- data %>%
group_by(Country, State) %>%
arrange(date) %>%
mutate(
new_cases = cases - lag(cases, default = 0),
new_deaths = deaths - lag(deaths, default = 0)
) %>%
ungroup()
# Filter the dataset for the US starting from 2020 and calculate daily new deaths and new cases
us_data <- data %>%
filter(Country == "US", date >= as.Date("2020-01-01")) %>%
group_by(date) %>%
summarize(
Total_New_Deaths = sum(new_deaths, na.rm = TRUE),
Total_New_Cases = sum(new_cases, na.rm = TRUE),
.groups = 'drop'
)
# Prepare data for faceting
us_data_long <- us_data %>%
pivot_longer(
cols = c(Total_New_Cases, Total_New_Deaths),
names_to = "Metric",
values_to = "Value"
)
# Visualize the observed data with facets
library(ggplot2)
ggplot(us_data_long, aes(x = date, y = Value, color = Metric)) +
geom_line() +
facet_wrap(~Metric, ncol = 1, scales = "free_y") +
ggtitle("Observed Daily New Cases and Deaths in the US") +
xlab("Date") +
ylab("Count") +
theme_minimal() +
theme(legend.position = "none")
# Fit the ARIMA model for new deaths
library(forecast)
arima_model_deaths <- auto.arima(us_data$Total_New_Deaths, seasonal = TRUE, stepwise = TRUE, approximation = FALSE)
# Print the model summary
summary(arima_model_deaths)
## Series: us_data$Total_New_Deaths
## ARIMA(4,1,4)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 0.3354 -0.0274 -0.6808 -0.1615 -1.1922 0.1871 0.8263 -0.6656
## s.e. 0.0516 0.0581 0.0535 0.0371 0.0451 0.0820 0.0793 0.0426
##
## sigma^2 = 177.1: log likelihood = -4574.06
## AIC=9166.11 AICc=9166.27 BIC=9211.48
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.05854253 13.25594 6.22893 NaN Inf 0.7061886 0.006993717
# Forecast the next 30 days for new deaths
forecast_deaths <- forecast(arima_model_deaths, h = 30)
# Create a dataframe for the forecast with actual dates
forecast_deaths_df <- data.frame(
Date = seq(max(us_data$date) + 1, by = "day", length.out = 30),
Forecast = forecast_deaths$mean,
Lower_80 = forecast_deaths$lower[, 1],
Upper_80 = forecast_deaths$upper[, 1],
Lower_95 = forecast_deaths$lower[, 2],
Upper_95 = forecast_deaths$upper[, 2]
)
# Plot the forecast with actual dates
library(ggplot2)
ggplot() +
geom_line(data = us_data, aes(x = date, y = Total_New_Deaths, color = "Observed")) +
geom_line(data = forecast_deaths_df, aes(x = Date, y = Forecast, color = "Forecast")) +
geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), fill = "blue", alpha = 0.2) +
geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_80, ymax = Upper_80), fill = "lightblue", alpha = 0.4) +
ggtitle("ARIMA Forecast for Daily New Deaths in the US") +
xlab("Date") +
ylab("New Deaths") +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.title = element_blank())
# Load the dataset
data <- us
# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")
# Ensure cumulative cases and deaths are converted to daily values
data <- data %>%
group_by(Country, State) %>%
arrange(date) %>%
mutate(
daily_cases = cases - lag(cases, default = 0),
daily_deaths = deaths - lag(deaths, default = 0)
) %>%
ungroup()
# Filter the dataset for the US starting from 2020 and calculate cumulative cases and deaths
state_cumulative_data <- data %>%
filter(Country == "US", date >= as.Date("2020-01-01")) %>%
group_by(State, date) %>%
summarize(
Cumulative_Cases = sum(cases, na.rm = TRUE),
Cumulative_Deaths = sum(deaths, na.rm = TRUE),
Population = signif(max(Population, na.rm = TRUE), 6),
.groups = 'drop'
) %>%
mutate(
Cases_Percentage = round((Cumulative_Cases / Population) * 100, 2),
Deaths_Percentage = round((Cumulative_Deaths / Population) * 100, 2)
)
# Get US states map data
us_states <- map_data("state")
# Get outer boundary of the United States
us_outer_boundary <- us_states %>%
group_by(group) %>%
filter(n() > 1)
# Merge COVID data with US states map data
state_cumulative_data <- state_cumulative_data %>%
mutate(State = tolower(State)) # Ensure state names are lowercase to match map data
merged_data <- left_join(us_states, state_cumulative_data, by = c("region" = "State"))
## Warning in left_join(us_states, state_cumulative_data, by = c(region = "State")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Create the base heatmap visualization
us_map <- ggplot(merged_data, aes(x = long, y = lat, group = group, fill = Cases_Percentage)) +
geom_polygon(color = "white") + # Remove internal state borders
geom_path(data = us_outer_boundary, aes(x = long, y = lat, group = group), color = "white", size = 0.7, inherit.aes = FALSE) + # Add outer US outline
scale_fill_gradientn(
name = "% of Population",
colors = c("white", "yellow", "lightblue", "lightgreen", "orange", "red"),
values = scales::rescale(c(0, 0.01, 0.05, 0.1, 0.5, 1)),
na.value = "grey50" # Handle missing values
) +
labs(
title = "Cumulative Cases as % of Population by State: {frame_time}",
subtitle = "Progression Over Time",
caption = "Source: US Dataset"
) +
coord_fixed(1.3) + # Fix aspect ratio
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
legend.position = "right"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Add animation to the heatmap over time
animated_map <- us_map +
transition_time(date) + # Animate by day
labs(title = "Cumulative Cases as % of Population by State: {frame_time}")
# Render the animation with an end pause
animate(animated_map, nframes = 100, fps = 5, end_pause = 20, renderer = gifski_renderer())
# Save the animation as a GIF
anim_save("covid_US_newcases_heatmap_percentage_animation.gif", animation = last_animation())
# Load the dataset
data <- us
# Convert date to Date format
data$date <- as.Date(data$date, format = "%m/%d/%y")
# Ensure cumulative cases and deaths are converted to daily values
data <- data %>%
group_by(Country, State) %>%
arrange(date) %>%
mutate(
daily_cases = cases - lag(cases, default = 0),
daily_deaths = deaths - lag(deaths, default = 0)
) %>%
ungroup()
# Filter the dataset for the US starting from 2020 and calculate cumulative cases and deaths
state_cumulative_data <- data %>%
filter(Country == "US", date >= as.Date("2020-01-01")) %>%
group_by(State, date) %>%
summarize(
Cumulative_Cases = sum(cases, na.rm = TRUE),
Cumulative_Deaths = sum(deaths, na.rm = TRUE),
Population = signif(max(Population, na.rm = TRUE), 6),
.groups = 'drop'
) %>%
mutate(
Cases_Percentage = round((Cumulative_Cases / Population) * 100, 2),
Deaths_Percentage = round((Cumulative_Deaths / Population) * 100, 2)
)
# Get US states map data
us_states <- map_data("state")
# Get outer boundary of the United States
us_outer_boundary <- us_states %>%
group_by(group) %>%
filter(n() > 1)
# Merge COVID data with US states map data
state_cumulative_data <- state_cumulative_data %>%
mutate(State = tolower(State)) # Ensure state names are lowercase to match map data
merged_data <- left_join(us_states, state_cumulative_data, by = c("region" = "State"))
## Warning in left_join(us_states, state_cumulative_data, by = c(region = "State")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Create the base heatmap visualization
us_map <- ggplot(merged_data, aes(x = long, y = lat, group = group, fill = Deaths_Percentage)) +
geom_polygon(color = "white") + # Remove internal state borders
geom_path(data = us_outer_boundary, aes(x = long, y = lat, group = group), color = "white", size = 0.7, inherit.aes = FALSE) + # Add outer US outline
scale_fill_gradientn(
name = "% of Population",
colors = c("white", "yellow", "lightblue", "lightgreen", "orange", "red"),
values = scales::rescale(c(0, 0.01, 0.05, 0.1, 0.5, 1)),
na.value = "grey50" # Handle missing values
) +
labs(
title = "Cumulative Deaths as % of Population by State: {frame_time}",
subtitle = "Progression Over Time",
caption = "Source: US Dataset"
) +
coord_fixed(1.3) + # Fix aspect ratio
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
legend.position = "right"
)
# Add animation to the heatmap over time
animated_map <- us_map +
transition_time(date) + # Animate by day
labs(title = "Cumulative Deaths as % of Population by State: {frame_time}")
# Render the animation with an end pause
animate(animated_map, nframes = 100, fps = 5, end_pause = 20, renderer = gifski_renderer())
# Save the animation as a GIF
anim_save("covid_US_heatmap_cumulative_deaths_percentage_animation.gif", animation = last_animation())
The ARIMA forecast for global data demonstrated its adaptability to larger datasets. Potential applications include:
# Load the dataset
data <- global
# Convert date to Date format
data$date <- as.Date(data$date)
# Ensure daily new cases and new deaths calculation if they are cumulative
data <- data %>%
arrange(date) %>%
group_by(`Country/Region`, `Province/State`) %>%
mutate(
new_cases = cases - lag(cases, default = 0),
new_deaths = deaths - lag(deaths, default = 0)
) %>%
ungroup()
# Check for unexpected values in new cases and new deaths
data <- data %>%
mutate(
new_cases = ifelse(new_cases < 0, NA, new_cases), # Remove negative values
new_deaths = ifelse(new_deaths < 0, NA, new_deaths) # Remove negative values
)
# Filter the dataset globally starting from 2020 and calculate daily new deaths and new cases
global_data <- data %>%
filter(date >= as.Date("2020-01-01")) %>%
group_by(date) %>%
summarize(
Total_New_Deaths = sum(new_deaths, na.rm = TRUE),
Total_New_Cases = sum(new_cases, na.rm = TRUE),
.groups = 'drop'
)
# Validate the aggregation results
print(summary(global_data$Total_New_Deaths))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 2180 5849 6028 8773 60903
print(summary(global_data$Total_New_Cases))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 265196 476495 592793 677628 4083281
# Prepare data for faceting
global_data_long <- global_data %>%
pivot_longer(
cols = c(Total_New_Cases, Total_New_Deaths),
names_to = "Metric",
values_to = "Value"
)
# Visualize the observed data with facets
library(ggplot2)
ggplot(global_data_long, aes(x = date, y = Value, color = Metric)) +
geom_line() +
facet_wrap(~Metric, ncol = 1, scales = "free_y") +
ggtitle("Observed Daily New Cases and Deaths Globally") +
xlab("Date") +
ylab("Count") +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")
# Fit the ARIMA model for new deaths
library(forecast)
arima_model_deaths <- auto.arima(global_data$Total_New_Deaths, seasonal = TRUE, stepwise = TRUE, approximation = FALSE)
# Print the model summary
summary(arima_model_deaths)
## Series: global_data$Total_New_Deaths
## ARIMA(5,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2
## 0.2076 -0.4615 -0.3542 -0.2962 -0.2818 -1.0503 0.5850
## s.e. 0.0738 0.0367 0.0306 0.0298 0.0345 0.0672 0.0872
##
## sigma^2 = 4916807: log likelihood = -10416.19
## AIC=20848.38 AICc=20848.51 BIC=20888.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.871213 2209.614 807.6754 -16.43327 30.38921 0.6818144
## ACF1
## Training set -0.005524948
# Forecast the next 30 days for new deaths
forecast_deaths <- forecast(arima_model_deaths, h = 30)
# Create a dataframe for the forecast with actual dates
forecast_deaths_df <- data.frame(
Date = seq(max(global_data$date) + 1, by = "day", length.out = 30),
Forecast = forecast_deaths$mean,
Lower_80 = forecast_deaths$lower[, 1],
Upper_80 = forecast_deaths$upper[, 1],
Lower_95 = forecast_deaths$lower[, 2],
Upper_95 = forecast_deaths$upper[, 2]
)
# Plot the forecast with actual dates
ggplot() +
geom_line(data = global_data, aes(x = date, y = Total_New_Deaths, color = "Observed")) +
geom_line(data = forecast_deaths_df, aes(x = Date, y = Forecast, color = "Forecast")) +
geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), fill = "blue", alpha = 0.2) +
geom_ribbon(data = forecast_deaths_df, aes(x = Date, ymin = Lower_80, ymax = Upper_80), fill = "lightblue", alpha = 0.4) +
ggtitle("ARIMA Forecast for Daily New Deaths Globally") +
xlab("Date") +
ylab("New Deaths") +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.title = element_blank())